library(ggplot2)
library(dplyr)
library(reshape2)
library(fst)
library(haven)

# Load the sibling data. The code below reads `full_sib` and `work_data`,
# neither of which is defined in this script.
# NOTE(review): presumably both are restored by this load() -- confirm.
load("all_sibs.rdata")

# Education margins per election year ----
#
# For every election year and outcome (running / being elected) this section
# counts, per education level 1-7:
#   population_margin  all members of sibships with a positive outcome sum
#   politician_margin  members whose outcome equals 1
#   sibling            members whose outcome equals 0
# The results are stacked into `bar_data` (local elections) and
# `bar_data_fv` (parliament).

# Ordinal education level for each label found in the `education` column.
# Labels that are not listed map to NA.
# NOTE(review): the "?" inside some labels presumably stands for a Danish
# letter lost in an encoding round-trip -- confirm against the fst files.
edu_level_lookup <- c(
  "Grundskole" = 1,
  "Erhvervsfaglige praktik- og hovedforl?b" = 2,
  "Almengymnasiale uddannelser" = 3,
  "Erhvervsgymnasiale uddannelser" = 3,
  "Korte videreg?ende uddannelser" = 4,
  "Mellemlange videreg?ende uddannelser" = 5,
  "Bachelor" = 5,
  "Lange videreg?ende uddannelser" = 6,
  "Forskeruddannelser" = 7
)

# Translate education labels into levels 1-7 (NA for missing/unknown labels).
#
# education: character vector or factor of education labels.
# Returns a numeric vector of the same length.
edu_to_level <- function(education) {
  # Collapse every run of non-ASCII bytes to a single "?" so that a label
  # matches the lookup whether the data holds the original letter (in any
  # encoding) or an already mangled "?".
  key <- gsub(
    "[^\\x20-\\x7E]+", "?", as.character(education),
    perl = TRUE, useBytes = TRUE
  )
  unname(edu_level_lookup[key])
}

# Read the education file for one year and add the `edu_level` column.
#
# year: election year, used to build the file name "udda<year>.fst".
read_edu_levels <- function(year) {
  read.fst(paste0(work_data, "grunddata/", "udda", year, ".fst")) %>%
    # sample_n(2000000) %>% # for fitting
    mutate(edu_level = edu_to_level(education))
}

# Count education levels for one outcome column.
#
# edu:         output of read_edu_levels(), joined on `PNR`.
# sib_data:    sibling data with `PNR`, `fe_sibship` and `outcome_col`.
# outcome_col: name of the 0/1 outcome column in `sib_data`.
# type_label:  value stored in the `type` column of the result.
# year:        value stored in the `year` column of the result.
# Returns a 7-row tibble (one row per education level); levels without any
# observation get a count of 0 rather than NA.
count_edu_margins <- function(edu, sib_data, outcome_col, type_label, year) {
  edu_levels <- 1:7

  # Sibships whose outcome sums to a positive number. sum() is called without
  # na.rm, so a sibship containing a missing outcome sums to NA and is dropped
  # by the filter (same as before this refactoring).
  sib_data$quant_year <- unlist(sib_data[, outcome_col])
  political_sibships <-
    as.data.frame(sib_data) %>%
    group_by(fe_sibship) %>%
    summarise(pol_fam = sum(quant_year)) %>%
    filter(pol_fam > 0)

  sibs_pol <-
    sib_data %>%
    filter(fe_sibship %in% political_sibships$fe_sibship)

  # Rows of `edu` without a match in `sibs_pol` have a missing outcome after
  # the join and are removed here.
  data_an <- left_join(edu, sibs_pol, by = "PNR")
  data_an <- data_an[!is.na(data_an[[outcome_col]]), , drop = FALSE]

  outcome <- data_an[[outcome_col]]
  level <- data_an[["edu_level"]]

  # tabulate() ignores NA levels and returns 0 for levels that do not occur.
  tally <- function(rows) {
    tabulate(level[rows], nbins = length(edu_levels))
  }

  tibble(
    type = type_label,
    edu_level = edu_levels,
    year = year,
    population_margin = tally(seq_along(level)),
    politician_margin = tally(which(outcome == 1)),
    sibling = tally(which(outcome == 0))
  )
}

# Build the stacked margins for a set of election years.
#
# years:        election years to process.
# outcome_cols: function(year) returning the outcome column names for that
#               year, parallel to `labels`.
# labels:       `type` label for each outcome column.
# sib_data:     sibling data passed on to count_edu_margins().
collect_margins <- function(years, outcome_cols, labels, sib_data) {
  per_year <- lapply(years, function(k) {
    # The education file depends on the year only, so read it once per year.
    edu <- read_edu_levels(k)
    cols <- outcome_cols(k)
    parts <- lapply(seq_along(cols), function(j) {
      count_edu_margins(edu, sib_data, cols[j], labels[j], k)
    })
    print(k) # progress
    bind_rows(parts)
  })
  bind_rows(per_year)
}

# create bars per year for local elections
bar_data <- collect_margins(
  years = seq(1993, 2013, by = 4),
  outcome_cols = function(k) {
    paste0(c("run_kv", "elected_kv"), "_", k)
  },
  labels = c("local candidate", "local winner"),
  sib_data = full_sib
)

# for parliamentarians: the candidate column carries an extra "_FV" suffix,
# the winner column does not.
bar_data_fv <- collect_margins(
  years = c(1990, 1994, 1998, 2001, 2005, 2007, 2011, 2015),
  outcome_cols = function(k) {
    c(paste0("run_fv_", k, "_FV"), paste0("elected_fv_", k))
  },
  labels = c("parliament candidate", "parliament winner"),
  sib_data = full_sib
)


bar_data <-
  as.data.frame(bar_data)

# Sum the yearly margins into one count per (type, edu_level, variable).
#
# margins: data frame with the id columns `type`, `edu_level`, `year` and one
#          count column per margin.
# Missing counts are treated as 0 so that a single empty cell cannot turn a
# whole sum (and every share derived from it) into NA. `variable` is returned
# as character so the two results can be stacked without factor coercion.
sum_margins_over_years <- function(margins) {
  as.data.frame(margins) %>%
    melt(id.vars = c("type", "edu_level", "year")) %>%
    mutate(
      value = ifelse(is.na(value), 0, value),
      variable = as.character(variable)
    ) %>%
    group_by(type, edu_level, variable) %>%
    summarise(count = sum(value)) %>%
    ungroup()
}

# repeat for fv data
bar_data2_fv <- sum_margins_over_years(bar_data_fv)

bar_data2 <-
  bind_rows(sum_margins_over_years(bar_data), bar_data2_fv) %>%
  group_by(type, edu_level, variable) %>%
  summarise(count = sum(count)) %>%
  ungroup()

# Total per (type, variable), used as the denominator of the shares.
bar_data_count <-
  bar_data2 %>%
  group_by(type, variable) %>%
  summarise(total = sum(count)) %>%
  ungroup()

bar_data2 <-
  left_join(bar_data2, bar_data_count, by = c("type", "variable")) %>%
  mutate(share = count / total)

# change order of factor

# Facet labels keyed by the internal type names; the order of this vector is
# the order of the factor levels.
type_labels <- c(
  "local candidate" = "Running for \nmunicipality",
  "local winner" = "Elected for \nmunicipality",
  "parliament candidate" = "Running for \nparliament",
  "parliament winner" = "Elected for \nparliament"
)

bar_data2 <-
  bar_data2 %>%
  ungroup() %>%
  mutate(
    # Siblings before politicians.
    variable = factor(
      variable,
      levels = c("population_margin", "sibling", "politician_margin")
    ),
    # Relabel by name rather than by row position, so the result does not
    # depend on the sort order or on the number of rows per type.
    type = factor(unname(type_labels[as.character(type)]),
                  levels = unname(type_labels))
  )
# create plot

# Dodged bars of the education shares of siblings and politicians, one facet
# row per `type`; the alpha level distinguishes the two groups.
plot <-
  ggplot(data = bar_data2 %>% filter(variable != "population_margin"),
         aes(x = edu_level, 
             y = share, 
             alpha = variable) ) +
  facet_grid(type ~ .) +
  geom_bar(stat = "identity", position = "dodge" ) +
  theme_classic() + 
  scale_alpha_discrete(range = c(0.2, 1), "", 
                       labels = c("Sibling", 
                                  "Politician")) + 
  scale_x_continuous("", breaks = 1:7,
                     labels = c("Primary \nschool",
                                "Vocational \ntraining",
                                "High \nschool",
                                "Short \ntertiary",
                                "Medium length \ntertiary",
                                "Long \ntertiary",
                                "PhD")) + 
  scale_y_continuous("") + 
  theme(legend.position = "top")

# add population data from main plot
# NOTE(review): load() restores objects under their saved names, so it may
# overwrite `bar_data2` (or other objects) used below -- presumably that is
# how the main-plot population data gets in; confirm what the file contains.
# The statement order here therefore matters.
load("edu_comp_data.rdata")
# Overlay the population shares as outlined, nearly transparent bars. The
# constant alpha replaces the `alpha = variable` mapping for this layer.
plot <- 
  plot + 
   geom_bar(data = bar_data2 %>% filter(variable == "population_margin"),
            aes(x = edu_level,
                y = share),
            stat = "identity", fill = "white", colour = "black", alpha = 1/10 )
